Environment Setting¶

In [39]:
#Baic Setting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')

#Error Detection
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

#Dummy Variable Analysis
from sklearn.linear_model import LinearRegression
import seaborn as sns
import statsmodels.api as sm

#STL Decoposition
from statsmodels.tsa.seasonal import STL, seasonal_decompose

#ARIMA Model
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.graphics.tsaplots as sgt
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima.utils import ndiffs
from pmdarima import auto_arima
import pmdarima as pmd
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

#GARCH Model
from statsmodels.stats.diagnostic import het_arch
from arch import arch_model

#White Noise
from statsmodels.stats.diagnostic import acorr_ljungbox

#Rolling Forecasting
from datetime import timedelta

Import Data¶

In [40]:
CSV_file = '/Users/lokheilee/python/fyp/ARIMA/Gilead_Sciences_monthly.csv'
In [41]:
def parser(s):
    return datetime.strptime(s, '%Y-%m-%d')

raw_df = pd.read_csv(CSV_file, parse_dates=[0], index_col=0, date_parser=parser)

#plot the data
plt.figure(figsize=(16, 9))
plt.plot(raw_df['Close'])
plt.title(' Monthly close price')
for year in range(2005, 2025):
    plt.axvline(datetime(year,1,1), color='k', linestyle='--', alpha=0.2)

Dummy Variable Analysis¶

In [42]:
#dataframe setting
dummy_df = pd.read_csv(CSV_file)

# Convert Date to datetime
dummy_df['Date'] = pd.to_datetime(dummy_df['Date'])

# Create time period counter (1, 2, 3, ...)
dummy_df = dummy_df.sort_values('Date')
dummy_df['Time_Period'] = range(1, len(dummy_df) + 1)

# Ensure 'Close' is numeric
dummy_df['Close'] = pd.to_numeric(dummy_df['Close'], errors='coerce')

# Remove any rows with NaN values if they exist
dummy_df = dummy_df.dropna()

# Create monthly dummy variables
dummy_df['Month'] = dummy_df['Date'].dt.month
dummy_months = pd.get_dummies(dummy_df['Month'], prefix='Month')

# Combine time period with dummy variables
X = pd.concat([dummy_df[['Time_Period']], dummy_months.iloc[:, :-1]], axis=1)
y = dummy_df['Close']

# Convert to numpy arrays and ensure they're float type
X = np.asarray(X).astype(float)
y = np.asarray(y).astype(float)

# Add constant term
X = sm.add_constant(X)

# Perform regression
model = sm.OLS(y, X).fit()

# Get predictions
dva_predictions = model.predict(X)

# Calculate additional metrics
dva_residuals = y - dva_predictions
mse = np.mean(dva_residuals**2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(dva_residuals))
mape = mean_absolute_percentage_error(y, dva_predictions) * 100

# Print detailed results
print("\nRegression Summary:")
print(model.summary())

# Create plots
plt.figure(figsize=(12, 6))
plt.plot(dummy_df['Date'], y, label='Actual', color='blue')
plt.plot(dummy_df['Date'], dva_predictions, label='Estimated', color='red', linestyle='--')
plt.title('Actual v Dummy Variable Estimated')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

#Print MAPE
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Print the equation
print("\nRegression Equation:")
print("Close = ", end="")
print(f"{model.params[0]:.4f}", end="")
print(f" + {model.params[1]:.4f}*Time_Period", end="")
for i in range(2, len(model.params)):
    print(f" + {model.params[i]:.4f}*Month_{i-1}", end="")
print()

# Residual Analysis
res = pd.DataFrame({
    'date': dummy_df['Date'],
    'resid': dva_residuals
})
res.set_index('date', inplace=True)

# Calculate statistics
res.resid_mean = res.resid.mean()
res.resid_std = res.resid.std()
lower_bound = res.resid_mean - 3 * res.resid_std
upper_bound = res.resid_mean + 3 * res.resid_std

# Plot the residuals with dates
plt.figure(figsize=(12, 8))
plt.plot(res.index, res.resid)
plt.fill_between(res.index, lower_bound, upper_bound, color='green', alpha=0.3)
plt.title('Dummy Variable Residuals Analysis')
plt.xlabel('Date')
plt.ylabel('Residuals')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Anomaly Detection
anomalies = res.resid[(res.resid < lower_bound) | (res.resid > upper_bound)]
plt.figure(figsize=(12, 8))
plt.plot(dummy_df['Date'], dummy_df['Close'])
for date in anomalies.index:
    plt.axvline(date, color='r', linestyle='--', alpha=0.5)
plt.title(' Dummy Variable Time Series with Anomalies')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print('\nDates of Anomalies:')
print(anomalies.index.strftime('%Y-%m-%d').tolist())

# Print Write Noice
white_noise_dva = acorr_ljungbox(dva_residuals, lags = [12], return_df=True)
white_noise_dva

# Print the p-value from the Ljung-Box test
print("\n")
print(f"Ljung-Box test p-value: {white_noise_dva['lb_pvalue'][12]}")

# Interpret the result
p_value = white_noise_dva['lb_pvalue'][12]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")

# LM Test for ARCH Effects using het_arch
lm_stat, lm_pval, f_stat, f_pval = het_arch(dva_residuals)

# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")

# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Regression Summary:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.752
Model:                            OLS   Adj. R-squared:                  0.739
Method:                 Least Squares   F-statistic:                     57.51
Date:                Wed, 26 Mar 2025   Prob (F-statistic):           6.53e-62
Time:                        23:56:39   Log-Likelihood:                -940.12
No. Observations:                 240   AIC:                             1906.
Df Residuals:                     227   BIC:                             1951.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.5181      3.159      1.747      0.082      -0.706      11.743
x1             0.3056      0.012     26.195      0.000       0.283       0.329
x2             0.2944      3.956      0.074      0.941      -7.501       8.090
x3            -0.9605      3.956     -0.243      0.808      -8.756       6.835
x4            -0.9965      3.956     -0.252      0.801      -8.791       6.798
x5            -1.0694      3.955     -0.270      0.787      -8.863       6.724
x6            -1.3762      3.955     -0.348      0.728      -9.170       6.417
x7            -0.9658      3.955     -0.244      0.807      -8.759       6.827
x8            -0.0614      3.955     -0.016      0.988      -7.854       7.731
x9             0.0794      3.955      0.020      0.984      -7.713       7.872
x10           -0.3503      3.954     -0.089      0.929      -8.142       7.442
x11            0.5925      3.954      0.150      0.881      -7.199       8.384
x12            1.1133      3.954      0.282      0.779      -6.678       8.905
==============================================================================
Omnibus:                       50.925   Durbin-Watson:                   0.074
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               79.640
Skew:                           1.215   Prob(JB):                     5.09e-18
Kurtosis:                       4.434   Cond. No.                     1.73e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.73e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Mean Absolute Percentage Error (MAPE): 26.72%

Regression Equation:
Close = 5.5181 + 0.3056*Time_Period + 0.2944*Month_1 + -0.9605*Month_2 + -0.9965*Month_3 + -1.0694*Month_4 + -1.3762*Month_5 + -0.9658*Month_6 + -0.0614*Month_7 + 0.0794*Month_8 + -0.3503*Month_9 + 0.5925*Month_10 + 1.1133*Month_11
Dates of Anomalies:
['2014-10-01', '2015-05-01', '2015-06-01', '2015-07-01']


Ljung-Box test p-value: 0.0

Interpretation:
Since p-value < 0.05, we reject the null hypothesis.
This suggests the residuals are not independent (not white noise).

=== ARCH LM Test Results ===
LM test statistic: 203.0241
LM test p-value: 3.775819944887155e-38

ARCH LM Test Interpretation:
Since p-value < 0.05, we reject the null hypothesis.
This suggests there is ARCH effect present in the residuals.

Additive STL¶

In [43]:
#Additive STL
#Import Data
df_stl = pd.read_csv(CSV_file, usecols=['Date', 'Close'])
df_stl = df_stl.dropna()
df_stl['Date'] = pd.to_datetime(df_stl['Date'])
df_stl.set_index('Date', inplace=True)

#Additive Decomposition
seasonal_decomp_add = seasonal_decompose(df_stl, model='additive', period=12)
df_stl['trend_add'] = seasonal_decomp_add.trend
df_stl['seasonal_add'] = seasonal_decomp_add.seasonal
df_stl['residual_add'] = seasonal_decomp_add.resid

#plot graph
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(12, 8))

# Plot original data
df_stl['Close'].plot(ax=ax1)
ax1.set_title('Original Time Series')
ax1.set_xlabel('Date')
ax1.set_ylabel('Close Price')

# Plot trend
seasonal_decomp_add.trend.plot(ax=ax2)
ax2.set_title('Trend')
ax2.set_xlabel('Date')
ax2.set_ylabel('Trend')

# Plot seasonal
seasonal_decomp_add.seasonal.plot(ax=ax3)
ax3.set_title('Seasonal')
ax3.set_xlabel('Date')
ax3.set_ylabel('Seasonal')

# Plot residual
seasonal_decomp_add.resid.plot(ax=ax4)
ax4.set_title('Residual')
ax4.set_xlabel('Date')
ax4.set_ylabel('Residual')

# Add main title
plt.suptitle('Additive Decomposition of Time Series', fontsize=16, y=1.02)

# Adjust layout
plt.tight_layout()
plt.show()

# Create diagnostic plots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# 1. Residual Plot over time
ax1.plot(df_stl.index, df_stl['residual_add'])
ax1.axhline(y=0, color='r', linestyle='--')
ax1.set_title('Residuals Over Time')
ax1.set_xlabel('Date')
ax1.set_ylabel('Residual')

# 2. Residual Histogram
sns.histplot(df_stl['residual_add'].dropna(), kde=True, ax=ax2)
ax2.set_title('Residual Distribution')
ax2.set_xlabel('Residual')
ax2.set_ylabel('Frequency')

# 3. Q-Q Plot
from scipy import stats
stats.probplot(df_stl['residual_add'].dropna(), dist="norm", plot=ax3)
ax3.set_title('Q-Q Plot')

# 4. Residual vs Fitted (Trend)
ax4.scatter(df_stl['trend_add'], df_stl['residual_add'])
ax4.axhline(y=0, color='r', linestyle='--')
ax4.set_title('Residual vs Fitted')
ax4.set_xlabel('Fitted Values (Trend)')
ax4.set_ylabel('Residuals')
plt.suptitle('Diagnostic Plots for Additive Decomposition', fontsize=16)
plt.tight_layout()
plt.show()

# Residual Analysis
res = pd.DataFrame(seasonal_decomp_add.resid)
res.columns = ['resid']

# Calculate statistics
res.resid_mean = res.resid.mean()
res.resid_std = res.resid.std()
lower_bound = res.resid_mean - 3 * res.resid_std
upper_bound = res.resid_mean + 3 * res.resid_std

# Plot the residuals
plt.figure(figsize=(12, 8))
plt.plot(res.resid)
plt.fill_between(res.resid.index, lower_bound, upper_bound, color='green', alpha=0.3)
plt.title('Additive Residuals Analysis')
plt.xlim(res.resid.index[0], res.resid.index[-1])
plt.show()

# Anomaly Detection
anomalies = res.resid[(res.resid < lower_bound) | (res.resid > upper_bound)]
plt.figure(figsize=(12, 8))
plt.plot(raw_df['Close'])  
for i in anomalies.index:
    plt.axvline(i, color='r', linestyle='--', alpha=0.5)
plt.title('Additive STL Time Series with Anomalies')
plt.show()

print(f'Dates of Anomalies: {anomalies.index}')

#forcastin by addictive
estimated_add = df_stl['trend_add'] + df_stl['seasonal_add']
plt.figure(figsize=(16, 9))
plt.plot(raw_df['Close'], label='Original')
plt.plot(estimated_add, label='Estimated_Add')
plt.title('Original vs Estimated(Additive STL)')
plt.legend()
for year in range(2005, 2025):
    plt.axvline(datetime(year,1,1), color='k', linestyle='--', alpha=0.2)
plt.show()

# mean absolute percentage error and root mean square error
mape = np.mean(np.abs((raw_df['Close'] - estimated_add) / raw_df['Close'])) * 100
rmse = np.mean((raw_df['Close'] - estimated_add) ** 2) ** 0.5
print(f'Mean Absolute Percentage Error: {mape}')
print(f'Root Mean Square Error: {rmse}')

# Print Write Noice
white_noise_add = acorr_ljungbox(df_stl['residual_add'].dropna(), lags = [12], return_df=True)
white_noise_add

# Print the p-value from the Ljung-Box test
print("\n")
print(f"Ljung-Box test p-value: {white_noise_add['lb_pvalue'][12]}")

# Interpret the result
p_value = white_noise_add['lb_pvalue'][12]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")
    
# LM Test for ARCH Effects using het_arch
lm_stat, lm_pval, f_stat, f_pval = het_arch(df_stl['residual_add'].dropna())

# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")

# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Dates of Anomalies: DatetimeIndex(['2020-04-01', '2022-11-01', '2024-05-01'], dtype='datetime64[ns]', name='Date', freq=None)
Mean Absolute Percentage Error: 5.947847323694597
Root Mean Square Error: 3.382089684320981


Ljung-Box test p-value: 1.0220833073493274e-27

Interpretation:
Since p-value < 0.05, we reject the null hypothesis.
This suggests the residuals are not independent (not white noise).

=== ARCH LM Test Results ===
LM test statistic: 55.7577
LM test p-value: 2.2790094839537187e-08

ARCH LM Test Interpretation:
Since p-value < 0.05, we reject the null hypothesis.
This suggests there is ARCH effect present in the residuals.
In [44]:
#GARCH Model
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
plot_acf(df_stl['residual_add'].dropna(), lags=40, zero=False, ax=ax2, title='Autocorrelation (For MA term)')
plot_pacf(df_stl['residual_add'].dropna(), lags=40, zero=False, ax=ax1, title = 'Partial Autocorrelation (For AR term)')
plt.show()

mdl_garch = arch_model(df_stl['residual_add'].dropna(), vol = 'GARCH', p = 1, q = 3)
garch_fit = mdl_garch.fit()
print(garch_fit.summary())

garch_std_resid = pd.Series(garch_fit.resid / garch_fit.conditional_volatility)
fig = plt.figure(figsize = (15, 8))

# Residual
garch_std_resid.plot(ax = fig.add_subplot(3,1,1), title = 'GARCH Standardized-Residual', legend = False)

# ACF/PACF
sgt.plot_acf(garch_std_resid, zero = False, lags = 40, ax=fig.add_subplot(3,2,3))
sgt.plot_pacf(garch_std_resid, zero = False, lags = 40, ax=fig.add_subplot(3,2,4))

# QQ-Plot & Norm-Dist
sm.qqplot(garch_std_resid, line='s', ax=fig.add_subplot(3,2,5)) 
plt.title("QQ Plot")
fig.add_subplot(3,2,6).hist(garch_std_resid, bins = 12)
plt.title("Histogram")

plt.tight_layout()
plt.show()

# Print Write Noice
white_noise_add = acorr_ljungbox(garch_std_resid, lags = [12], return_df=True)
white_noise_add

# Print the p-value from the Ljung-Box test
print("\n")
print(f"Ljung-Box test p-value: {white_noise_add['lb_pvalue'][12]}")

# Interpret the result
p_value = white_noise_add['lb_pvalue'][12]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")
    
# LM Test for ARCH Effects using het_arch
lm_stat, lm_pval, f_stat, f_pval = het_arch(garch_std_resid)

# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")

# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Iteration:      1,   Func. Count:      8,   Neg. LLF: 352282.5623818712
Iteration:      2,   Func. Count:     16,   Neg. LLF: 2099.191642868249
Iteration:      3,   Func. Count:     24,   Neg. LLF: 590.1877487688679
Iteration:      4,   Func. Count:     33,   Neg. LLF: 534.736507352731
Iteration:      5,   Func. Count:     41,   Neg. LLF: 534.1803505112179
Iteration:      6,   Func. Count:     49,   Neg. LLF: 534.8150314818827
Iteration:      7,   Func. Count:     57,   Neg. LLF: 533.8966257897875
Iteration:      8,   Func. Count:     64,   Neg. LLF: 533.8765961682659
Iteration:      9,   Func. Count:     71,   Neg. LLF: 533.8765508268564
Iteration:     10,   Func. Count:     78,   Neg. LLF: 533.8765482164501
Iteration:     11,   Func. Count:     84,   Neg. LLF: 533.8765478170626
Optimization terminated successfully    (Exit mode 0)
            Current function value: 533.8765482164501
            Iterations: 11
            Function evaluations: 84
            Gradient evaluations: 11
                     Constant Mean - GARCH Model Results                      
==============================================================================
Dep. Variable:           residual_add   R-squared:                       0.000
Mean Model:             Constant Mean   Adj. R-squared:                  0.000
Vol Model:                      GARCH   Log-Likelihood:               -533.877
Distribution:                  Normal   AIC:                           1079.75
Method:            Maximum Likelihood   BIC:                           1100.33
                                        No. Observations:                  228
Date:                Wed, Mar 26 2025   Df Residuals:                      227
Time:                        23:56:41   Df Model:                            1
                                Mean Model                                
==========================================================================
                 coef    std err          t      P>|t|    95.0% Conf. Int.
--------------------------------------------------------------------------
mu            -0.1516      0.126     -1.201      0.230 [ -0.399,9.578e-02]
                              Volatility Model                             
===========================================================================
                 coef    std err          t      P>|t|     95.0% Conf. Int.
---------------------------------------------------------------------------
omega          0.2276      0.159      1.428      0.153 [-8.483e-02,  0.540]
alpha[1]       0.4088  9.738e-02      4.198  2.695e-05    [  0.218,  0.600]
beta[1]        0.1408      0.173      0.815      0.415    [ -0.198,  0.479]
beta[2]        0.1771      0.376      0.470      0.638    [ -0.561,  0.915]
beta[3]        0.2733      0.417      0.655      0.513    [ -0.545,  1.091]
===========================================================================

Covariance estimator: robust

Ljung-Box test p-value: 7.081596215629725e-17

Interpretation:
Since p-value < 0.05, we reject the null hypothesis.
This suggests the residuals are not independent (not white noise).

=== ARCH LM Test Results ===
LM test statistic: 5.4971
LM test p-value: 0.855599747400229

ARCH LM Test Interpretation:
Since p-value > 0.05, we fail to reject the null hypothesis.
This suggests there is no ARCH effect in the residuals.

Multiciative STL¶

In [45]:
#Multiplicative Decomposition
from statsmodels.tsa.seasonal import STL, seasonal_decompose
seasonal_decomp_multi = seasonal_decompose(df_stl['Close'], model='multiplicative', period=12)
df_stl['trend_multi'] = seasonal_decomp_multi.trend
df_stl['seasonal_multi'] = seasonal_decomp_multi.seasonal
df_stl['residual_multi'] = seasonal_decomp_multi.resid

#plot graph
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(12, 8))

# Plot original data
df_stl['Close'].plot(ax=ax1)
ax1.set_title('Original Time Series')
ax1.set_xlabel('Date')
ax1.set_ylabel('Close Price')

# Plot trend
seasonal_decomp_multi.trend.plot(ax=ax2)
ax2.set_title('Trend')
ax2.set_xlabel('Date')
ax2.set_ylabel('Trend')

# Plot seasonal
seasonal_decomp_multi.seasonal.plot(ax=ax3)
ax3.set_title('Seasonal')
ax3.set_xlabel('Date')
ax3.set_ylabel('Seasonal')

# Plot residual
seasonal_decomp_multi.resid.plot(ax=ax4)
ax4.set_title('Residual')
ax4.set_xlabel('Date')
ax4.set_ylabel('Residual')

# Add main title
plt.suptitle('Multipliacative Decomposition of Time Series', fontsize=16, y=1.02)

# Adjust layout
plt.tight_layout()
plt.show()

# Create diagnostic plots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# 1. Residual Plot over time
ax1.plot(df_stl.index, df_stl['residual_multi'])
ax1.axhline(y=0, color='r', linestyle='--')
ax1.set_title('Residuals Over Time')
ax1.set_xlabel('Date')
ax1.set_ylabel('Residual')

# 2. Residual Histogram
sns.histplot(df_stl['residual_multi'].dropna(), kde=True, ax=ax2)
ax2.set_title('Residual Distribution')
ax2.set_xlabel('Residual')
ax2.set_ylabel('Frequency')

# 3. Q-Q Plot
from scipy import stats
stats.probplot(df_stl['residual_multi'].dropna(), dist="norm", plot=ax3)
ax3.set_title('Q-Q Plot')

# 4. Residual vs Fitted (Trend)
ax4.scatter(df_stl['trend_multi'], df_stl['residual_multi'])
ax4.axhline(y=0, color='r', linestyle='--')
ax4.set_title('Residual vs Fitted')
ax4.set_xlabel('Fitted Values (Trend)')
ax4.set_ylabel('Residuals')

plt.suptitle('Diagnostic Plots for Multipliacative Decomposition', fontsize=16)
plt.tight_layout()
plt.show()

# Residual Analysis
res = pd.DataFrame(seasonal_decomp_multi.resid)
res.columns = ['resid']

# Calculate statistics
res.resid_mean = res.resid.mean()
res.resid_std = res.resid.std()
lower_bound = res.resid_mean - 3 * res.resid_std
upper_bound = res.resid_mean + 3 * res.resid_std

# Plot the residuals
plt.figure(figsize=(12, 8))
plt.plot(res.resid)
plt.fill_between(res.resid.index, lower_bound, upper_bound, color='green', alpha=0.3)
plt.title('Multipliacative Residuals Analysis')
plt.xlim(res.resid.index[0], res.resid.index[-1])
plt.show()

# Anomaly Detection
anomalies = res.resid[(res.resid < lower_bound) | (res.resid > upper_bound)]
plt.figure(figsize=(12, 8))
plt.plot(raw_df['Close'])  
for i in anomalies.index:
    plt.axvline(i, color='r', linestyle='--', alpha=0.5)
plt.title('Multipliacative STL Time Series with Anomalies')
plt.show()

print(f'Dates of Anomalies: {anomalies.index}')

#forcast in by multiactive
estimated_multi = df_stl['trend_multi'] + df_stl['seasonal_multi']
plt.figure(figsize=(16, 9))
plt.plot(raw_df['Close'], label='Original')
plt.plot(estimated_multi, label='Estimated_Multi')
plt.title('Original vs Estimated(Multiple STL)')
plt.legend()
for year in range(2005, 2025):
    plt.axvline(datetime(year,1,1), color='k', linestyle='--', alpha=0.2)
plt.show()

# mean absolute percentage error and root mean square error
mape = np.mean(np.abs((raw_df['Close'] - estimated_multi) / raw_df['Close'])) * 100
rmse = np.mean((raw_df['Close'] - estimated_multi) ** 2) ** 0.5
print(f'Mean Absolute Percentage Error: {mape}')
print(f'Root Mean Square Error: {rmse}')

# Print Write Noice
white_noise_multi = acorr_ljungbox(df_stl['residual_multi'].dropna(), lags = [12], return_df=True)
white_noise_multi

# Print the p-value from the Ljung-Box test
print("\n=== Ljung-Box test Results ===")
print(f"Ljung-Box test p-value: {white_noise_multi['lb_pvalue'][12]}")

# Interpret the result
p_value = white_noise_multi['lb_pvalue'][12]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")
    
# LM Test for ARCH Effects using het_arch
lm_stat, lm_pval, f_stat, f_pval = het_arch(df_stl['residual_multi'].dropna())

# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")

# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Dates of Anomalies: DatetimeIndex(['2020-04-01'], dtype='datetime64[ns]', name='Date', freq=None)
Mean Absolute Percentage Error: 7.114270037834504
Root Mean Square Error: 3.603418909131697

=== Ljung-Box test Results ===
Ljung-Box test p-value: 3.5063073205305405e-21

Interpretation:
Since p-value < 0.05, we reject the null hypothesis.
This suggests the residuals are not independent (not white noise).

=== ARCH LM Test Results ===
LM test statistic: 85.8593
LM test p-value: 3.5326063367906394e-14

ARCH LM Test Interpretation:
Since p-value < 0.05, we reject the null hypothesis.
This suggests there is ARCH effect present in the residuals.
In [46]:
#GARCH Model
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
plot_acf(df_stl['residual_multi'].dropna(), lags=40, zero=False, ax=ax2, title='Autocorrelation (For MA term)')
plot_pacf(df_stl['residual_multi'].dropna(), lags=40, zero=False, ax=ax1, title = 'Partial Autocorrelation (For AR term)')
plt.show()

mdl_garch = arch_model(df_stl['residual_multi'].dropna(), vol = 'GARCH', p = 1, q = 3)
garch_fit = mdl_garch.fit()
print(garch_fit.summary())

garch_std_resid = pd.Series(garch_fit.resid / garch_fit.conditional_volatility)
fig = plt.figure(figsize = (15, 8))

# Residual
garch_std_resid.plot(ax = fig.add_subplot(3,1,1), title = 'GARCH Standardized-Residual', legend = False)

# ACF/PACF
sgt.plot_acf(garch_std_resid, zero = False, lags = 40, ax=fig.add_subplot(3,2,4))
sgt.plot_pacf(garch_std_resid, zero = False, lags = 40, ax=fig.add_subplot(3,2,3))

# QQ-Plot & Norm-Dist
sm.qqplot(garch_std_resid, line='s', ax=fig.add_subplot(3,2,5)) 
plt.title("QQ Plot")
fig.add_subplot(3,2,6).hist(garch_std_resid, bins = 12)
plt.title("Histogram")

plt.tight_layout()
plt.show()

# Print Write Noice
white_noise_add = acorr_ljungbox(garch_std_resid, lags = [12], return_df=True)
white_noise_add

# Print the p-value from the Ljung-Box test
print("\n")
print(f"Ljung-Box test p-value: {white_noise_add['lb_pvalue'][12]}")

# Interpret the result
p_value = white_noise_add['lb_pvalue'][12]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")
    
# LM Test for ARCH Effects using het_arch
lm_stat, lm_pval, f_stat, f_pval = het_arch(garch_std_resid)

# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")

# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Iteration:      1,   Func. Count:      8,   Neg. LLF: 2049087.0251516781
Iteration:      2,   Func. Count:     20,   Neg. LLF: 2692.340695813669
Iteration:      3,   Func. Count:     28,   Neg. LLF: -279.4323496771234
Iteration:      4,   Func. Count:     37,   Neg. LLF: -175.8420853791262
Iteration:      5,   Func. Count:     45,   Neg. LLF: -301.3392313723543
Iteration:      6,   Func. Count:     52,   Neg. LLF: -301.27062791660154
Iteration:      7,   Func. Count:     60,   Neg. LLF: -300.9407438011037
Iteration:      8,   Func. Count:     68,   Neg. LLF: -301.4506053907312
Iteration:      9,   Func. Count:     76,   Neg. LLF: -301.48019145227585
Iteration:     10,   Func. Count:     83,   Neg. LLF: -301.48022617358356
Iteration:     11,   Func. Count:     90,   Neg. LLF: -301.48022777839446
Iteration:     12,   Func. Count:     96,   Neg. LLF: -301.4802277677842
Optimization terminated successfully    (Exit mode 0)
            Current function value: -301.48022777839446
            Iterations: 12
            Function evaluations: 96
            Gradient evaluations: 12
                     Constant Mean - GARCH Model Results                      
==============================================================================
Dep. Variable:         residual_multi   R-squared:                       0.000
Mean Model:             Constant Mean   Adj. R-squared:                  0.000
Vol Model:                      GARCH   Log-Likelihood:                301.480
Distribution:                  Normal   AIC:                          -590.960
Method:            Maximum Likelihood   BIC:                          -570.384
                                        No. Observations:                  228
Date:                Wed, Mar 26 2025   Df Residuals:                      227
Time:                        23:56:44   Df Model:                            1
                               Mean Model                               
========================================================================
                 coef    std err          t      P>|t|  95.0% Conf. Int.
------------------------------------------------------------------------
mu             0.9927  7.905e-03    125.574      0.000 [  0.977,  1.008]
                               Volatility Model                              
=============================================================================
                 coef    std err          t      P>|t|       95.0% Conf. Int.
-----------------------------------------------------------------------------
omega      2.2933e-03  1.628e-03      1.408      0.159 [-8.982e-04,5.485e-03]
alpha[1]       0.3674      0.151      2.436  1.487e-02    [7.174e-02,  0.663]
beta[1]    2.2990e-16      0.223  1.029e-15      1.000      [ -0.438,  0.438]
beta[2]        0.1390      0.766      0.182      0.856      [ -1.362,  1.640]
beta[3]    1.6009e-16      0.414  3.870e-16      1.000      [ -0.811,  0.811]
=============================================================================

Covariance estimator: robust

Ljung-Box test p-value: 2.677826057842571e-15

Interpretation:
Since p-value < 0.05, we reject the null hypothesis.
This suggests the residuals are not independent (not white noise).

=== ARCH LM Test Results ===
LM test statistic: 2.5351
LM test p-value: 0.9903546162860751

ARCH LM Test Interpretation:
Since p-value > 0.05, we fail to reject the null hypothesis.
This suggests there is no ARCH effect in the residuals.
In [47]:
garch_fit.summary()
Out[47]:
Constant Mean - GARCH Model Results
Dep. Variable: residual_multi R-squared: 0.000
Mean Model: Constant Mean Adj. R-squared: 0.000
Vol Model: GARCH Log-Likelihood: 301.480
Distribution: Normal AIC: -590.960
Method: Maximum Likelihood BIC: -570.384
No. Observations: 228
Date: Wed, Mar 26 2025 Df Residuals: 227
Time: 23:56:44 Df Model: 1
Mean Model
coef std err t P>|t| 95.0% Conf. Int.
mu 0.9927 7.905e-03 125.574 0.000 [ 0.977, 1.008]
Volatility Model
coef std err t P>|t| 95.0% Conf. Int.
omega 2.2933e-03 1.628e-03 1.408 0.159 [-8.982e-04,5.485e-03]
alpha[1] 0.3674 0.151 2.436 1.487e-02 [7.174e-02, 0.663]
beta[1] 2.2990e-16 0.223 1.029e-15 1.000 [ -0.438, 0.438]
beta[2] 0.1390 0.766 0.182 0.856 [ -1.362, 1.640]
beta[3] 1.6009e-16 0.414 3.870e-16 1.000 [ -0.811, 0.811]


Covariance estimator: robust

ARIMA Model¶

In [48]:
#ARIMA Model

#Import data
arima_df = raw_df['Close']

#infer the frequency of the data
df_freq = arima_df.asfreq(pd.infer_freq(raw_df.index))

#ADF Test (for stationarity)
result = adfuller(arima_df.dropna())
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
if result[1] <= 0.05:
    print('Data is stationary')
    diff_order = 0
else:
    print('Data is non-stationary')
    diff_order = 1

diff = arima_df.diff()[1:]
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
ax1.plot(arima_df)
ax1.set_title('Monthly Close Price')
ax2.plot(diff)
ax2.set_title('Monthly Close Price (Difference Once)')
plt.show()

#ACF and PACF plots for the data
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
if diff_order == 0:
    setting = arima_df
else:
    setting = diff
plot_acf(setting, lags=20, zero=False, ax=ax2, title='Autocorrelation (For MA term)')
plot_pacf(setting, lags=20, zero=False, ax=ax1, title = 'Partial Autocorrelation (For AR term)')
plt.show()

#Arima Model
pmd_mdl = pmd.auto_arima(y = arima_df,
                         d = diff_order)
print(pmd_mdl.summary())

# Get fitted values (in-sample predictions)
fitted_values = pd.Series(pmd_mdl.predict_in_sample(), index=arima_df.index)

# Calculate residuals
residuals = arima_df - fitted_values

# Create comparison DataFrame
comparison = pd.DataFrame({
    'Actual': arima_df,
    'Estimated': fitted_values,
    'Residuals': residuals
})

# Plotting
fig, (ax1) = plt.subplots(1, 1, figsize=(12, 8))

# Plot 1: Actual vs Estimated
ax1.plot(arima_df.index, arima_df, label='Actual', color='blue')
ax1.plot(arima_df.index, fitted_values, label='Estimated', color='red', linestyle='--')
ax1.set_title('Actual vs Estimated Values ARIMA Model')
ax1.legend()
ax1.grid(True)

# Plot 2: Residuals
ax2.plot(arima_df.index, residuals, color='green')
ax2.axhline(y=0, color='r', linestyle='--')
ax2.set_title('Residuals')
ax2.grid(True)

plt.tight_layout()
plt.show()

# Calculate performance metrics
mse = ((residuals) ** 2).mean()
rmse = np.sqrt(mse)
mae = abs(residuals).mean()
mape = (abs(residuals) / arima_df * 100).mean()

print("\nPerformance Metrics:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.4f}%")

pmd_mdl.plot_diagnostics(figsize = (15, 10))
plt.show()

# Calculate statistics
res.resid_mean = residuals.mean()
res.resid_std = residuals.std()
lower_bound = res.resid_mean - 3 * res.resid_std
upper_bound = res.resid_mean + 3 * res.resid_std

# Plot the residuals
plt.figure(figsize=(12, 8))
plt.plot(residuals)
plt.fill_between(residuals.index, lower_bound, upper_bound, color='green', alpha=0.3)
plt.title('ARIMA Residuals Analysis')
plt.xlim(residuals.index[0], residuals.index[-1])
plt.show()

# Anomaly Detection
anomalies = residuals[(residuals < lower_bound) | (residuals > upper_bound)]
plt.figure(figsize=(12, 8))
plt.plot(raw_df['Close'])
for i in anomalies.index:
    plt.axvline(i, color='r', linestyle='--', alpha=0.5)
plt.title('ARIMA  Time Series with Anomalies')
plt.show()

print(f'Dates of Anomalies: {anomalies.index}')


# Print Write Noice
white_noise_arima = acorr_ljungbox(residuals, lags = [12], return_df=True)
white_noise_arima

# Print the p-value from the Ljung-Box test
print("\n=== Ljung-Box test Results ===")
print(f"Ljung-Box test p-value: {white_noise_arima['lb_pvalue'][12]}")

# Interpret the result
p_value = white_noise_arima['lb_pvalue'][12]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")

# LM Test for ARCH Effects using het_arch
lm_stat, lm_pval, f_stat, f_pval = het_arch(residuals)

# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")

# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
ADF Statistic: -0.6472245123232313
p-value: 0.859950295455792
Data is non-stationary
                               SARIMAX Results                                
==============================================================================
Dep. Variable:                      y   No. Observations:                  240
Model:               SARIMAX(2, 1, 2)   Log Likelihood                -625.968
Date:                Wed, 26 Mar 2025   AIC                           1263.935
Time:                        23:56:46   BIC                           1284.794
Sample:                    01-01-2005   HQIC                          1272.341
                         - 12-01-2024                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept      0.6014      0.392      1.532      0.125      -0.168       1.371
ar.L1          0.2478      0.041      5.985      0.000       0.167       0.329
ar.L2         -0.9429      0.033    -28.417      0.000      -1.008      -0.878
ma.L1         -0.1825      0.040     -4.514      0.000      -0.262      -0.103
ma.L2          0.9445      0.037     25.434      0.000       0.872       1.017
sigma2        11.0018      0.807     13.636      0.000       9.420      12.583
===================================================================================
Ljung-Box (L1) (Q):                   0.57   Jarque-Bera (JB):                54.48
Prob(Q):                              0.45   Prob(JB):                         0.00
Heteroskedasticity (H):              18.10   Skew:                             0.42
Prob(H) (two-sided):                  0.00   Kurtosis:                         5.18
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Performance Metrics:
MSE: 11.0818
RMSE: 3.3289
MAE: 2.3417
MAPE: 5.9224%
Dates of Anomalies: DatetimeIndex(['2014-08-01', '2016-01-01', '2022-10-01'], dtype='datetime64[ns]', name='Date', freq=None)

=== Ljung-Box test Results ===
Ljung-Box test p-value: 0.3998817785935515

Interpretation:
Since p-value > 0.05, we fail to reject the null hypothesis.
This suggests the residuals are independent (white noise).

=== ARCH LM Test Results ===
LM test statistic: 25.6393
LM test p-value: 0.004256812590031472

ARCH LM Test Interpretation:
Since p-value < 0.05, we reject the null hypothesis.
This suggests there is ARCH effect present in the residuals.

Split into 12 step testing¶

In [49]:
# Data preparation
dummy_df = pd.read_csv(CSV_file)
dummy_df['Date'] = pd.to_datetime(dummy_df['Date'])
dummy_df = dummy_df.sort_values('Date')
dummy_df['Time_Period'] = range(1, len(dummy_df) + 1)
dummy_df['Close'] = pd.to_numeric(dummy_df['Close'], errors='coerce')
dummy_df = dummy_df.dropna()

# Create monthly dummy variables - ensure they're numeric
dummy_df['Month'] = dummy_df['Date'].dt.month
dummy_months = pd.get_dummies(dummy_df['Month'], prefix='Month').astype(float)

# Combine features and ensure numeric types
X = pd.concat([dummy_df[['Time_Period']].astype(float), dummy_months.iloc[:, :-1]], axis=1)
y = dummy_df['Close'].astype(float)

# Add constant term
X = sm.add_constant(X)

# Train/test split
train_mask = dummy_df['Date'] <= '2023-12-31'
test_mask = (dummy_df['Date'] > '2023-12-31') & (dummy_df['Date'] <= '2024-12-31')

# Convert to numpy arrays explicitly
X_train = X[train_mask].values.astype(float)
X_test = X[test_mask].values.astype(float)
y_train = y[train_mask].values.astype(float)
y_test = y[test_mask].values.astype(float)

# Model fitting
try:
    model = sm.OLS(y_train, X_train).fit()
except Exception as e:
    print("Error in model fitting:", e)
    # Debugging info
    print("X_train dtype:", X_train.dtype)
    print("y_train dtype:", y_train.dtype)
    print("NaN in X_train:", np.isnan(X_train).sum())
    print("NaN in y_train:", np.isnan(y_train).sum())
    raise

# Predictions
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
all_pred = model.predict(X.values.astype(float))

# MAPE calculations
train_mape = mean_absolute_percentage_error(y_train, train_pred) * 100
test_mape = mean_absolute_percentage_error(y_test, test_pred) * 100
whole_mape = mean_absolute_percentage_error(y, all_pred) * 100

dva_train_mape = train_mape
dva_test_mape = test_mape
dva_whole_mape = whole_mape

print(f"Train MAPE: {train_mape:.2f}%")
print(f"Test MAPE: {test_mape:.2f}%")
print(f"Whole Data MAPE: {whole_mape:.2f}%")

# Plot
plt.figure(figsize=(12, 6))
plt.plot(dummy_df['Date'], y, label='Actual', color='blue')
plt.plot(dummy_df['Date'][train_mask], train_pred, label='Train Estimated', color='orange')
plt.plot(dummy_df['Date'][test_mask], test_pred, label='Test Estimated', color='green')
# Add vertical line to separate train and test
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Dummy Variable Model: Prediction\n'
    f'Train MAPE: {train_mape:.2f}%, '
    f'Test MAPE: {test_mape:.2f}%, '
    f'Whole Data MAPE: {whole_mape:.2f}%'
)
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Train MAPE: 27.19%
Test MAPE: 10.68%
Whole Data MAPE: 26.37%
In [50]:
# Data preparation
df_stl = pd.read_csv(CSV_file, usecols=['Date', 'Close'])
df_stl['Date'] = pd.to_datetime(df_stl['Date'])
df_stl.set_index('Date', inplace=True)

# Decomposition
seasonal_decomp_add = seasonal_decompose(df_stl, model='additive', period=12)
df_stl['trend_add'] = seasonal_decomp_add.trend
df_stl['seasonal_add'] = seasonal_decomp_add.seasonal

# Train/test split
train_mask = df_stl.index <= '2023-12-31'
test_mask = (df_stl.index > '2023-12-31') & (df_stl.index <= '2024-12-31')

# Create estimates
estimated_add = df_stl['trend_add'] + df_stl['seasonal_add']

# MAPE calculations
train_mape = np.mean(np.abs((df_stl['Close'][train_mask] - estimated_add[train_mask]) / df_stl['Close'][train_mask])) * 100
test_mape = np.mean(np.abs((df_stl['Close'][test_mask] - estimated_add[test_mask]) / df_stl['Close'][test_mask])) * 100
whole_mape = np.mean(np.abs((df_stl['Close'] - estimated_add) / df_stl['Close'])) * 100

add_train_mape = train_mape
add_test_mape = test_mape
add_whole_mape = whole_mape

print(f"Train MAPE: {train_mape:.2f}%")
print(f"Test MAPE: {test_mape:.2f}%")
print(f"Whole Data MAPE: {whole_mape:.2f}%")

# Plot
plt.figure(figsize=(12, 6))
plt.plot(df_stl['Close'], label='Actual', color='blue')
plt.plot(estimated_add[train_mask], label='Train Estimated', color='orange')
plt.plot(estimated_add[test_mask], label='Test Estimated', color='green')
# Add vertical line to separate train and test
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Additive STL: Prediction\n'
    f'Train MAPE: {train_mape:.2f}%, '
    f'Test MAPE: {test_mape:.2f}%, '
    f'Whole Data MAPE: {whole_mape:.2f}%'
)
plt.legend()
plt.tight_layout()
plt.show()
Train MAPE: 5.88%
Test MAPE: 8.64%
Whole Data MAPE: 5.95%
In [51]:
# Data preparation (same as additive)
df_stl = pd.read_csv(CSV_file, usecols=['Date', 'Close'])
df_stl['Date'] = pd.to_datetime(df_stl['Date'])
df_stl.set_index('Date', inplace=True)

# Decomposition
seasonal_decomp_mul = seasonal_decompose(df_stl, model='multiplicative', period=12)
df_stl['trend_mul'] = seasonal_decomp_mul.trend
df_stl['seasonal_mul'] = seasonal_decomp_mul.seasonal

# Train/test split
train_mask = df_stl.index <= '2023-12-31'
test_mask = (df_stl.index > '2023-12-31') & (df_stl.index <= '2024-12-31')

# Create estimates
estimated_mul = df_stl['trend_mul'] * df_stl['seasonal_mul']

# MAPE calculations
train_mape = np.mean(np.abs((df_stl['Close'][train_mask] - estimated_mul[train_mask]) / df_stl['Close'][train_mask])) * 100
test_mape = np.mean(np.abs((df_stl['Close'][test_mask] - estimated_mul[test_mask]) / df_stl['Close'][test_mask])) * 100
whole_mape = np.mean(np.abs((df_stl['Close'] - estimated_mul) / df_stl['Close'])) * 100

multi_train_mape = train_mape
multi_test_mape = test_mape
multi_whole_mape = whole_mape

print(f"Train MAPE: {train_mape:.2f}%")
print(f"Test MAPE: {test_mape:.2f}%")
print(f"Whole Data MAPE: {whole_mape:.2f}%")

# Plot
plt.figure(figsize=(12, 6))
plt.plot(df_stl['Close'], label='Actual', color='blue')
plt.plot(estimated_mul[train_mask], label='Train Estimated', color='orange')
plt.plot(estimated_mul[test_mask], label='Test Estimated', color='green')
# Add vertical line to separate train and test
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Multiplicative STL: Prediction\n'
    f'Train MAPE: {train_mape:.2f}%, '
    f'Test MAPE: {test_mape:.2f}%, '
    f'Whole Data MAPE: {whole_mape:.2f}%'
)
plt.legend()
plt.tight_layout()
plt.show()
Train MAPE: 5.36%
Test MAPE: 8.74%
Whole Data MAPE: 5.45%

Rolling Forecasting¶

In [52]:
#rolling forecasting by ARIMA
#Split the data
train_end = datetime(2023, 12, 31)
test_end = datetime(2024, 12, 31)

# Assuming you want to predict 'Close' column - adjust column name as needed
train_data = raw_df['Close'][:train_end]
test_data = raw_df['Close'][train_end + timedelta(days=1):test_end]
print(f'Train data shape: {train_data.shape}')
print(f'Test data shape: {test_data.shape}')

my_order = pmd_mdl.order

#using the rolling forecast origin
rolling_predictions = test_data.copy()
for train_end in test_data.index:
    train_data = raw_df['Close'][:train_end - timedelta(days=1)]
    model = SARIMAX(endog=train_data, order=my_order)
    model_fit = model.fit()
    pred = model_fit.forecast()
    rolling_predictions[train_end] = pred

print(model_fit.summary())

rolling_residuals = test_data - rolling_predictions
plt.figure(figsize=(10, 4))
plt.plot(rolling_residuals)
plt.axhline(0, linestyle='--', color='k')
plt.title('Rolling Forecast Residuals')
plt.ylabel('Error')

plt.figure(figsize=(10, 4))
plt.plot(test_data, label='Actual')
plt.plot(rolling_predictions, label='Predicted')
plt.title('Rolling Forecast')
plt.ylabel('Close Price')
plt.legend()

print('Mean Absolute Percentage Error:', round(np.mean(abs(rolling_residuals/test_data))*100, 4))
print('Root Mean Squared Error:', np.sqrt(np.mean(rolling_residuals**2)))
Train data shape: (228,)
Test data shape: (12,)
                               SARIMAX Results                                
==============================================================================
Dep. Variable:                  Close   No. Observations:                  239
Model:               SARIMAX(2, 1, 2)   Log Likelihood                -625.069
Date:                Wed, 26 Mar 2025   AIC                           1260.137
Time:                        23:56:49   BIC                           1277.499
Sample:                    01-01-2005   HQIC                          1267.134
                         - 11-01-2024                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.2476      0.045      5.520      0.000       0.160       0.335
ar.L2         -0.9381      0.035    -26.485      0.000      -1.007      -0.869
ma.L1         -0.1789      0.043     -4.114      0.000      -0.264      -0.094
ma.L2          0.9404      0.039     24.065      0.000       0.864       1.017
sigma2        11.1611      0.795     14.043      0.000       9.603      12.719
===================================================================================
Ljung-Box (L1) (Q):                   0.61   Jarque-Bera (JB):                51.69
Prob(Q):                              0.44   Prob(JB):                         0.00
Heteroskedasticity (H):              19.31   Skew:                             0.41
Prob(H) (two-sided):                  0.00   Kurtosis:                         5.13
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Mean Absolute Percentage Error: 5.4819
Root Mean Squared Error: 4.648457454156803
In [53]:
#Split the data
train_end = datetime(2023, 12, 31)
test_end = datetime(2024, 12, 31)

# Assuming you want to predict 'Close' column
train_data = raw_df['Close'][:train_end]
test_data = raw_df['Close'][train_end + timedelta(days=1):test_end]
print(f'Train data shape: {train_data.shape}')
print(f'Test data shape: {test_data.shape}')

my_order = pmd_mdl.order

# Get training predictions
train_predictions = train_data.copy()
for train_idx in train_data.index[50:]:  # Starting from 50 to have enough data for initial prediction
    temp_train = raw_df['Close'][:train_idx - timedelta(days=1)]
    model = SARIMAX(endog=temp_train, order=my_order)
    model_fit = model.fit()
    pred = model_fit.forecast()
    train_predictions[train_idx] = pred
    
# Calculate training residuals (add this after your train_predictions calculation)
train_residuals = train_data[train_predictions.index] - train_predictions  # Align indices first
train_residuals = train_residuals.dropna()  

# Get testing predictions
rolling_predictions = test_data.copy()
for train_end in test_data.index:
    train_data = raw_df['Close'][:train_end - timedelta(days=1)]
    model = SARIMAX(endog=train_data, order=my_order)
    model_fit = model.fit()
    pred = model_fit.forecast()
    rolling_predictions[train_end] = pred

# Calculate metrics for test set
rolling_residuals = test_data - rolling_predictions
train_residuals = train_data - train_predictions

# Calculate MAPE

# Calculate MAPE for test set (in percentage)
test_mape = round(np.mean(abs(rolling_residuals/test_data)) * 100, 2)  # Multiply by 100 for percentage
print('Test Set Mean Absolute Percentage Error:', test_mape, '%')

# Calculate MAPE for training set (in percentage)
train_mape = round(np.mean(abs(train_residuals/train_data[train_residuals.index])) * 100, 2)
print('Training Set Mean Absolute Percentage Error:', train_mape, '%')

# Calculate MAPE for full dataset (in percentage)
full_predictions = pd.concat([train_predictions, rolling_predictions])
full_actual = pd.concat([train_data, test_data])
full_residuals = full_actual - full_predictions
full_mape = round(np.mean(abs(full_residuals/full_actual)) * 100, 2)  # Multiply by 100 for percentage
print('Full Dataset Mean Absolute Percentage Error:', full_mape, '%')


print('Root Mean Squared Error:', np.sqrt(np.mean(rolling_residuals**2)))

arima_train_mape = train_mape
arima_test_mape = test_mape
arima_whole_mape = full_mape

# Plotting
plt.figure(figsize=(15, 6))
plt.plot(raw_df['Close'], label='Actual', color='blue')
plt.plot(train_predictions, label='Training Predictions', color='orange')
plt.plot(rolling_predictions, label='Testing Predictions', color='green')

# Add vertical line to separate train and test
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'ARIMA Model: Prediction\n'
    f'Train MAPE: {train_mape:.2f}%, '
    f'Test MAPE: {test_mape:.2f}%, '
    f'Whole Data MAPE: {full_mape:.2f}%'
)
plt.ylabel('Close Price')
plt.legend()
for year in range(2005, 2025):
    plt.axvline(datetime(year,1,1), color='k', linestyle='--', alpha=0.2)
plt.show()

# Plot residuals
plt.figure(figsize=(15, 4))
plt.plot(rolling_residuals)
plt.axhline(0, linestyle='--', color='k')
plt.title('Testing Forecast Residuals')
plt.ylabel('Error')
plt.grid(True, alpha=0.3)
plt.show()

# Print Write Noice
white_noise_arima = acorr_ljungbox(rolling_residuals, lags = [1], return_df=True)
white_noise_arima

# Print the p-value from the Ljung-Box test
print("\n=== Ljung-Box test Results ===")
print(f"Ljung-Box test p-value: {white_noise_arima['lb_pvalue'][1]}")

# Interpret the result
p_value = white_noise_arima['lb_pvalue'][1]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")

# LM Test for ARCH Effects using het_arch
lm_stat, lm_pval, f_stat, f_pval = het_arch(rolling_residuals)

# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")

# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Train data shape: (228,)
Test data shape: (12,)
Test Set Mean Absolute Percentage Error: 5.48 %
Training Set Mean Absolute Percentage Error: 4.67 %
Full Dataset Mean Absolute Percentage Error: 4.85 %
Root Mean Squared Error: 4.648457454156803
=== Ljung-Box test Results ===
Ljung-Box test p-value: 0.21585103820068754

Interpretation:
Since p-value > 0.05, we fail to reject the null hypothesis.
This suggests the residuals are independent (white noise).

=== ARCH LM Test Results ===
LM test statistic: 5.5706
LM test p-value: 0.06171122350105976

ARCH LM Test Interpretation:
Since p-value > 0.05, we fail to reject the null hypothesis.
This suggests there is no ARCH effect in the residuals.
In [54]:
# Plotting
plt.figure(figsize=(15, 6))
plt.plot(raw_df['Close'], label='Actual', color='blue')
plt.plot(train_predictions, label='Training Predictions', color='orange')
plt.plot(rolling_predictions, label='Testing Predictions', color='green')

# Add vertical line to separate train and test
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'ARIMA Model: Prediction\n'
    f'Train MAPE: {train_mape:.2f}%, '
    f'Test MAPE: {test_mape:.2f}%, '
    f'Whole Data MAPE: {full_mape:.2f}%'
)
plt.ylabel('Close Price')
plt.legend()
for year in range(2005, 2025):
    plt.axvline(datetime(year,1,1), color='k', linestyle='--', alpha=0.2)
plt.show()
In [55]:
#plot the split data (4 graphs)
plt.figure(figsize=(20, 12))

# 1. Dummy Variable Model (Top Left)
plt.subplot(2, 2, 1)
plt.plot(dummy_df['Date'], y, label='Actual', color='blue')
plt.plot(dummy_df['Date'][train_mask], train_pred, label='Train Estimated', color='orange')
plt.plot(dummy_df['Date'][test_mask], test_pred, label='Test Estimated', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Dummy Variable Model: Prediction\n'
    f'Train MAPE: {dva_train_mape:.2f}%, Test MAPE: {dva_test_mape:.2f}%, Whole MAPE:{dva_whole_mape:.2f}%'
)
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)

# 2. Additive STL (Top Right)
plt.subplot(2, 2, 2)
plt.plot(df_stl['Close'], label='Actual', color='blue')
plt.plot(estimated_add[train_mask], label='Train Estimated', color='orange')
plt.plot(estimated_add[test_mask], label='Test Estimated', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Additive STL: Prediction\n'
    f'Train MAPE: {add_train_mape:.2f}%, Test MAPE: {add_test_mape:.2f}%, Whole MAPE:{add_whole_mape:.2f}%'
)
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)

# 3. Multiplicative STL (Bottom Left)
plt.subplot(2, 2, 3)
plt.plot(df_stl['Close'], label='Actual', color='blue')
plt.plot(estimated_mul[train_mask], label='Train Estimated', color='orange')
plt.plot(estimated_mul[test_mask], label='Test Estimated', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Multiplicative STL: Prediction\n'
    f'Train MAPE: {multi_train_mape:.2f}%, Test MAPE: {multi_test_mape:.2f}%, Whole MAPE:{multi_whole_mape:.2f}%'
)
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)

# 4. ARIMA Model (Bottom Right)
plt.subplot(2, 2, 4)
plt.plot(raw_df['Close'], label='Actual', color='blue')
plt.plot(train_predictions, label='Training Predictions', color='orange')
plt.plot(rolling_predictions, label='Testing Predictions', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'ARIMA Model: Prediction\n'
    f'Train MAPE: {arima_train_mape:.2f}%, Test MAPE: {arima_test_mape:.2f}%, Whole MAPE:{arima_whole_mape:.2f}%'
)
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()